In [1]:
from Bio import SeqIO
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
In [2]:
# Load every record from the new Alaska waterfowl FASTA file into memory
# (the file name says 79 viruses), then show how many records were read.
new_seqs = list(SeqIO.parse('Alaska_waterfowl_79viruses_seqs_20151223.fasta', 'fasta'))
len(new_seqs)
Out[2]:
In [3]:
# Records divided by 8 -- presumably one record per genome segment (the
# segment table later in this notebook lists 8), giving a virus count.
len(new_seqs) / 8
Out[3]:
In [4]:
# Load the IRD metadata table. na_filter=False turns off pandas' NA detection
# on read, and 'Collection Date' is parsed as a date.
ird_data = pd.read_csv('20160104_brandt_ird.csv', na_filter=False, parse_dates=['Collection Date'])

# Keep only the piece after the first ':' in the host species field.
ird_data['Host Species'] = ird_data['Host Species'].str.split(':').str[1]

# Keep only the text before the first '(' in the strain name.
ird_data['Strain Name'] = ird_data['Strain Name'].str.split('(').str[0]

# Manual data cleanup of one strain name. regex=False makes this a plain
# substring replacement regardless of the installed pandas version.
ird_data['Strain Name'] = ird_data['Strain Name'].str.replace(
    'A/American black duck/Maine/44411/532/2008',
    'A/American black duck/Maine/44411_532/2008',
    regex=False,
)

# Treat both placeholder values as missing.
ird_data['State/Province'] = ird_data['State/Province'].replace(['Unknown', '-N/A-'], np.nan)

# Strip literal asterisks from accessions. '*' on its own is not a valid
# regular expression, so the replacement must be literal (regex=False).
ird_data['Sequence Accession'] = ird_data['Sequence Accession'].str.replace('*', '', regex=False)
In [14]:
# Rows whose State/Province is exactly 'Alaska', divided by 8
# (the same per-virus divisor used for the other record counts).
len(ird_data.loc[ird_data['State/Province'].eq('Alaska')]) / 8
Out[14]:
In [6]:
# Load the metadata table that accompanies the new Alaska waterfowl sequences
# and display its column names.
new_data = pd.read_csv('Alaska_waterfowl_79viruses_metadata_20151223.csv')
new_data.columns
Out[6]:
In [7]:
# Map segment number (1-8) to gene name; list order defines the numbering.
segnum_name = dict(
    enumerate(['PB2', 'PB1', 'PA', 'HA', 'NP', 'NA', 'M', 'NS'], start=1)
)
In [15]:
# Load every record from the IRD FASTA download into a list.
ird_seqs = list(SeqIO.parse('20160104_brandt_ird.fasta', 'fasta'))
def change_id_name(s, ird_data, segnum_name):
    """
    Rename a sequence record to '<strain name>|<gene name>'.

    The record's current id is matched against the 'Sequence Accession'
    column of ird_data; the first matching row supplies the strain name and
    the segment value. Spaces in the new id are replaced with underscores.

    s: a BioPython seqrecord (its `id` attribute is read and, on success,
       overwritten in place)
    ird_data: the data downloaded from the IRD; needs the columns
       'Sequence Accession', 'Strain Name' and 'Segment'
    segnum_name: mapping from the value in the 'Segment' column to a gene name

    Returns the same record s. When no row matches the id, or the segment
    value is missing from segnum_name, the record is printed and returned
    with its id unchanged.
    """
    try:
        row = ird_data.loc[ird_data['Sequence Accession'] == s.id]
        idx = row.index[0]  # IndexError when no row matches this id
        strain_name = row['Strain Name'][idx]
        gene_name = segnum_name[row['Segment'][idx]]  # KeyError on unknown segment
    except (IndexError, KeyError):
        # Only the expected lookup failures are reported and skipped; anything
        # else (including KeyboardInterrupt) now propagates to the caller.
        print(s)
        return s

    s.id = (str(strain_name) + '|' + str(gene_name)).replace(' ', '_')
    return s
# Rename every IRD record, using joblib with n_jobs=-1.
results = Parallel(n_jobs=-1)(
    delayed(change_id_name)(record, ird_data, segnum_name)
    for record in ird_seqs
)
# Record count divided by 8, the same per-virus divisor used above.
print(len(results) / 8)
In [16]:
# Write the processed IRD records in `results` out as a single FASTA file.
# (Only `results` is written here; nothing is merged with `new_seqs`.)
SeqIO.write(sequences=results, handle='ird_seqs_name_as_accession.fasta', format='fasta')
Out[16]:
In [ ]:
In [ ]: